purrr for quick summary tables - St Koopurrr - Anna Quaglierimutate, across and case_when - Anna Quaglieripurrr for quick summary tables - St KooAdapted from this fantasic Learn to Purrr tutorial
# Load example data
data("mtcars")
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
You can use purrr to loop over all the columns, and output the info into a dataframe.
In this example, I want to see variable type, max character length, and missingness.
library(tidyverse) # Includes purrr
# Create dataframe with some summary info
summary_df <- mtcars %>%
purrr::map_df(
~data.frame(
class = class(.x),
max_char = max(nchar(.x, keepNA = FALSE)),
missing = sum(is.na(.x))
), .id ="col_name"
)
summary_df
## col_name class max_char missing
## 1 mpg numeric 4 0
## 2 cyl numeric 1 0
## 3 disp numeric 5 0
## 4 hp numeric 3 0
## 5 drat numeric 4 0
## 6 wt numeric 5 0
## 7 qsec numeric 5 0
## 8 vs numeric 1 0
## 9 am numeric 1 0
## 10 gear numeric 1 0
## 11 carb numeric 1 0
And because it’s a dataframe, you can use a package like kableExtra to format it for reports.
library(kableExtra)
summary_df %>%
kableExtra::kbl() %>%
kableExtra::kable_styling()
| col_name | class | max_char | missing |
|---|---|---|---|
| mpg | numeric | 4 | 0 |
| cyl | numeric | 1 | 0 |
| disp | numeric | 5 | 0 |
| hp | numeric | 3 | 0 |
| drat | numeric | 4 | 0 |
| wt | numeric | 5 | 0 |
| qsec | numeric | 5 | 0 |
| vs | numeric | 1 | 0 |
| am | numeric | 1 | 0 |
| gear | numeric | 1 | 0 |
| carb | numeric | 1 | 0 |
purrr - Anna QuaglieriFor the example I am going to use the flights dataset from the R package nycflights13. I am going to fit linear model that tries to explain the arr_time as a function of dep_time and arr_delay.
library(nycflights13)
library(purrr)
flights %>%
dplyr::select(arr_time, dep_time, arr_delay, carrier) %>%
head()
## # A tibble: 6 x 4
## arr_time dep_time arr_delay carrier
## <int> <int> <dbl> <chr>
## 1 830 517 11 UA
## 2 850 533 20 UA
## 3 923 542 33 AA
## 4 1004 544 -18 B6
## 5 812 554 -25 DL
## 6 740 554 12 UA
To fit the model to the whole dataset we would use the following code:
summary(lm(arr_time ~ dep_time + arr_delay, data = flights))
##
## Call:
## lm(formula = arr_time ~ dep_time + arr_delay, data = flights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2250.59 -68.14 50.97 169.26 2342.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 491.250510 2.045943 240.1 <2e-16 ***
## dep_time 0.757657 0.001446 524.1 <2e-16 ***
## arr_delay -1.633355 0.015815 -103.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 392.8 on 327343 degrees of freedom
## (9430 observations deleted due to missingness)
## Multiple R-squared: 0.4566, Adjusted R-squared: 0.4566
## F-statistic: 1.375e+05 on 2 and 327343 DF, p-value: < 2.2e-16
What if we wanted to fit separate models by carrier?
models <- flights %>%
dplyr::select(arr_time, dep_time, arr_delay, carrier) %>%
tidyr::nest(-carrier) %>%
dplyr::mutate(fit = purrr::map(data, ~ lm(arr_time ~ dep_time + arr_delay, data = .))) %>%
dplyr::mutate(results_fit = purrr::map(fit, function(f) confint(f)))
models
## # A tibble: 16 x 4
## carrier data fit results_fit
## <chr> <list> <list> <list>
## 1 UA <tibble [58,665 × 3]> <lm> <dbl[,2] [3 × 2]>
## 2 AA <tibble [32,729 × 3]> <lm> <dbl[,2] [3 × 2]>
## 3 B6 <tibble [54,635 × 3]> <lm> <dbl[,2] [3 × 2]>
## 4 DL <tibble [48,110 × 3]> <lm> <dbl[,2] [3 × 2]>
## 5 EV <tibble [54,173 × 3]> <lm> <dbl[,2] [3 × 2]>
## 6 MQ <tibble [26,397 × 3]> <lm> <dbl[,2] [3 × 2]>
## 7 US <tibble [20,536 × 3]> <lm> <dbl[,2] [3 × 2]>
## 8 WN <tibble [12,275 × 3]> <lm> <dbl[,2] [3 × 2]>
## 9 VX <tibble [5,162 × 3]> <lm> <dbl[,2] [3 × 2]>
## 10 FL <tibble [3,260 × 3]> <lm> <dbl[,2] [3 × 2]>
## 11 AS <tibble [714 × 3]> <lm> <dbl[,2] [3 × 2]>
## 12 9E <tibble [18,460 × 3]> <lm> <dbl[,2] [3 × 2]>
## 13 F9 <tibble [685 × 3]> <lm> <dbl[,2] [3 × 2]>
## 14 HA <tibble [342 × 3]> <lm> <dbl[,2] [3 × 2]>
## 15 YV <tibble [601 × 3]> <lm> <dbl[,2] [3 × 2]>
## 16 OO <tibble [32 × 3]> <lm> <dbl[,2] [3 × 2]>
expand_models <- models %>%
tidyr::unnest(results_fit, .drop=TRUE)
expand_models
## # A tibble: 48 x 4
## carrier data fit results_fit[,1] [,2]
## <chr> <list> <list> <dbl> <dbl>
## 1 UA <tibble [58,665 × 3]> <lm> 491. 511.
## 2 UA <tibble [58,665 × 3]> <lm> 0.757 0.771
## 3 UA <tibble [58,665 × 3]> <lm> -1.83 -1.66
## 4 AA <tibble [32,729 × 3]> <lm> 434. 455.
## 5 AA <tibble [32,729 × 3]> <lm> 0.822 0.838
## 6 AA <tibble [32,729 × 3]> <lm> -1.02 -0.852
## 7 B6 <tibble [54,635 × 3]> <lm> 843. 870.
## 8 B6 <tibble [54,635 × 3]> <lm> 0.406 0.425
## 9 B6 <tibble [54,635 × 3]> <lm> -2.66 -2.42
## 10 DL <tibble [48,110 × 3]> <lm> 418. 436.
## # … with 38 more rows
expand_models$fit[1]
## [[1]]
##
## Call:
## lm(formula = arr_time ~ dep_time + arr_delay, data = .)
##
## Coefficients:
## (Intercept) dep_time arr_delay
## 501.083 0.764 -1.746
mutate, across and case_when - Anna QuaglieriI found the method below really useful to recode the levels of one or several columns!
library(dplyr)
test_data <- data.frame(area = rep(c("North", "Sud", "East", "West"),times = c(2,3,4,1)),
quality_before = c("High","Low",
"High","Low","Medium",
"Medium","Low","High","High",
"Low"),
quality_after = c("High","High",
"High","Medium","Medium",
"Low","Low","High","High",
"Low"))
test_data %>%
mutate(across(.cols = c(quality_before, quality_after),
~ case_when(
. == "Low" ~ 0,
. == "Medium" ~ 1,
. == "High" ~ 2
)
)
)
## area quality_before quality_after
## 1 North 2 2
## 2 North 0 2
## 3 Sud 2 2
## 4 Sud 0 1
## 5 Sud 1 1
## 6 East 1 0
## 7 East 0 0
## 8 East 2 2
## 9 East 2 2
## 10 West 0 0
Strongly suggest to have a look at other functions and applications to perform column-wise operations https://cran.r-project.org/web/packages/dplyr/vignettes/colwise.html.
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] nycflights13_1.0.1 kableExtra_1.3.1 forcats_0.5.0 stringr_1.4.0
## [5] dplyr_1.0.0 purrr_0.3.4 readr_1.3.1 tidyr_1.1.0
## [9] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0 knitr_1.30
## [13] png_0.1-7 magick_2.4.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.19 haven_2.3.1 lattice_0.20-41
## [5] colorspace_2.0-0 vctrs_0.3.4 generics_0.0.2 viridisLite_0.3.0
## [9] htmltools_0.5.0 yaml_2.2.1 utf8_1.1.4 blob_1.2.1
## [13] rlang_0.4.8 pillar_1.4.6 withr_2.3.0 glue_1.4.2
## [17] DBI_1.1.0 dbplyr_1.4.4 modelr_0.1.8 readxl_1.3.1
## [21] lifecycle_0.2.0 munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
## [25] rvest_0.3.5 evaluate_0.14 fansi_0.4.1 highr_0.8
## [29] broom_0.5.6 Rcpp_1.0.5 backports_1.2.0 scales_1.1.1
## [33] webshot_0.5.2 jsonlite_1.7.1 fs_1.4.2 hms_0.5.3
## [37] digest_0.6.27 stringi_1.5.3 cli_2.1.0 tools_4.0.2
## [41] magrittr_1.5 crayon_1.3.4 pkgconfig_2.0.3 ellipsis_0.3.1
## [45] xml2_1.3.2 reprex_0.3.0 lubridate_1.7.9 assertthat_0.2.1
## [49] rmarkdown_2.3 httr_1.4.2 rstudioapi_0.13 R6_2.5.0
## [53] nlme_3.1-148 compiler_4.0.2
I code in tidyverse universe plus tidylog to output all message corresponding to changes made to vector, dataframe, tibble, etc. Please find tidylog package @ https://github.com/elbersb/tidylog.
library(tidyverse)
# colnames(iris)
# summary(iris)
# str(iris)
library(Hmisc)
iris$Species %>% describe
## .
## n missing distinct
## 150 0 3
##
## Value setosa versicolor virginica
## Frequency 50 50 50
## Proportion 0.333 0.333 0.333
library(tidylog, warn.conflicts = FALSE, quietly = FALSE)
new_dt <- iris %>%
filter(Sepal.Length >= 4.6) %>%
mutate(new_name = case_when(
Species == "versicolor" ~ "V",
Species == "setosa" ~ "S"))
library(tidylog, warn.conflicts = FALSE, quietly = FALSE)
new_dt <- iris %>%
filter(Sepal.Length >= 4.6) %>%
mutate(new_name = case_when(
Species == "versicolor" ~ "Versicolor",
Species == "setosa" ~ "Setosa",
TRUE ~ "Virginica"))
Use iris dataframe as an example.
dt <- head(iris,5)
# dt %>% select("Species", everything(.))
dt %>% relocate("Species", .before = "Sepal.Length")
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.1 3.5 1.4 0.2
## 2 setosa 4.9 3.0 1.4 0.2
## 3 setosa 4.7 3.2 1.3 0.2
## 4 setosa 4.6 3.1 1.5 0.2
## 5 setosa 5.0 3.6 1.4 0.2
# dt %>% relocate(where(is.numeric), .after = where(is.factor))
library(devtools)
# install_github("mrdwab/SOfun", force=TRUE)
library(SOfun)
v <- letters[1:7]
v %>% moveMe(., "a last; b, e, g before d; c first; g after b")
## [1] "c" "b" "g" "e" "d" "f" "a"
hidden_na_dt <- data.frame(
"student" = rep(c("A", "B", "C"),2),
"assignment" = rep(c("A1", "A2"),3),
"mark" = c(NA, runif(n = 5, min = 45, max = 100))
) %>%
filter(!is.na(mark))
hidden_na_dt
## student assignment mark
## 1 B A2 81.66840
## 2 C A1 83.72778
## 3 A A2 75.92982
## 4 B A1 87.81650
## 5 C A2 87.53854
hidden_na_dt %>%
complete(student, nesting(assignment), fill = list(mark = 0))
## # A tibble: 6 x 3
## student assignment mark
## <chr> <chr> <dbl>
## 1 A A1 0
## 2 A A2 75.9
## 3 B A1 87.8
## 4 B A2 81.7
## 5 C A1 83.7
## 6 C A2 87.5
I believe ggplot2 / plotly is relative popular in practice. I also recommend highercharter to visualize timeseries data and/or visNetwork / igraph / ggraph to visualize networks. My focus today is labeling inside a chart, so that I will use ggplot2 to demonstrate.
plt_original <- population %>%
filter(country %in% c("India", "United States of America", "Viet Nam",
"Lao People's Democratic Republic")) %>%
ggplot(aes(x = year, y = population, group = country, color = country))+
geom_line()
plt_original
The purpose of having customized functions is to improve readability and reduce cognitive load for digesting information provided by visualization.
si_num <- function (x) {
if (!is.na(x)) {
if (x < 0){
sign <- "-"
x <- abs(x)
}else{
sign <- ""
x <- x
}
if (x >= 1e12) {
chrs <- strsplit(format(x, scientific=12), split="")[[1]];
len <- chrs[seq(1,length(chrs)-12)] %>% length();
rem <- chrs[seq(1,length(chrs)-11)];
rem <- append(rem, ".", after = len) %>% append("T");
}
if (x >= 1e9) {
chrs <- strsplit(format(x, scientific=12), split="")[[1]];
len <- chrs[seq(1,length(chrs)-9)] %>% length();
rem <- chrs[seq(1,length(chrs)-8)];
rem <- append(rem, ".", after = len) %>% append("B");
}
else if (x >= 1e6) {
chrs <- strsplit(format(x, scientific=12), split="")[[1]];
len <- chrs[seq(1,length(chrs)-6)] %>% length();
rem <- chrs[seq(1,length(chrs)-5)]
rem <- append(rem, ".", after = len) %>% append("M");
}
else if (x >= 1e3) {
chrs <- strsplit(format(x, scientific=12), split="")[[1]];
len <- chrs[seq(1,length(chrs)-3)] %>% length();
rem <- chrs[seq(1,length(chrs)-2)];
rem <- append(rem, ".", after = len) %>% append("K");
}
else {
return(x);
}
return(str_c(sign, paste(rem, sep="", collapse=""), sep = ""));
}
else return(NA);
}
si_vec <- function(x) {
sapply(x, FUN=si_num);
}
library(hrbrthemes)
library(scales)
library(ggrepel)
library(cowplot)
year_series <- unique(population$year)
reminder <- (max(year_series) - min(year_series)) %% 4
new_breaks <- seq(from = min(year_series) + reminder, to = max(year_series), by = 4)
df <- population %>%
filter(country %in% c("India", "United States of America", "Viet Nam",
"Lao People's Democratic Republic"))
df_end <- df %>%
group_by(country) %>%
filter(year == max(year)) %>%
ungroup()
plt_adjust <- df %>%
ggplot(aes(x = year, y = population, group = country, color = country))+
geom_line()+
geom_point()+
geom_text_repel(
data = df_end,
aes(label = str_wrap(country,25)),
nudge_x = 1,
direction = "y",## nudge vertically
size = 3,
hjust = 0, ### left aligned
segment.size = 0.3, ### from here is about the line to connect the data point and text
min.segment.length = 0,
segment.color = "grey60") +
theme_ipsum() +
theme(legend.position = "none") +
scale_y_continuous(labels = si_vec)+
scale_x_continuous(breaks = new_breaks, limits = c(NA, 2020))+
labs(x = "Year", y = "Population", title = "Population Growth between 1995 and 2013")
plt_original
plt_adjust
library(plotly)
plt_plotly <- df %>%
mutate(text = str_c("Country: ", country, "\n",
"Year: ", year, "\n",
"Population: ", si_vec(population))) %>%
ggplot(aes(x = year, y = population, group = country, color = country, text = text))+
geom_line()+
geom_point()+
theme_ipsum() +
theme(legend.position = "none") +
scale_y_continuous(labels = si_vec)+
scale_x_continuous(breaks = new_breaks)+
labs(x = "Year", y = "Population", title = "Population Growth between 1995 and 2013")
ggplotly({plt_plotly}, tooltip = "text")
To be continue, I have coded many interactive plots in shinyapps, and some can be found from https://coffeeandplot.com/apps/. This is a relatively new website we created couples of month ago. Get in touch if you have any suggestions. Please find me @ https://www.linkedin.com/in/ytsong/.
This section describes how to render an RMarkdown report within a simple R conda environment on a Command Line Interface (cluster or linux environment). This could be achieved in two possible ways:
environment.yml file that documents the package dependenciesBoth work but the second way is the recommended one, which will be described below.
Create an environment.yml file, that looks something like
#name of the conda environment
name: HowRYou
#the paths that conda takes a look for packages. Avoid using anaconda channel as we have
#experienced issues using it
channels:
- conda-forge
- bioconda
- defaults
#install following packages in the conda environment
#change according to the packages you are using in your RMardown file.
#The first three are required (are R essentail). You can also change the versions to
# meet the requirements of your analysis
dependencies:
- r-base=3.4.1
- pandoc=1.19
- r-rmarkdown=1.6
- r-hereCreate a conda environment (in this case HowRYou is the conda environment name specified in the environment.yml file. -p flag should point to your miniconda installation path. To find how to install conda, check this
conda env create -p /path/to/miniconda/envs/HowRYou --file environment.ymlActivate this conda environment
conda activate HowRYouRun the RMarkdown file
Rscript -e "rmarkdown::render('HowRYou.Rmd')"
To pass arguments to the Rmd script (in this case two arguments - an input directory location and name of the input vcf file)
Rscript -e "rmarkdown::render('HowRYou.Rmd', params = list(directory = './data', file = 'dummy.txt'))"An example of a rendered script used in the above step # 4
# install.packages(c("tidyverse", "patchwork, palmerpenguins", "devtools"))
# devtools::install_github("bahlolab/ggwehi")
library(tidyverse)
library(patchwork)
# library(ggwehi)
library(palmerpenguins)
plot_1 <- penguins %>%
ggplot(aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
geom_point() +
# scale_color_wehi() +
labs(x = "Flipper length (mm)",
y = "Body mass (g)",
color = "Penguin species") +
theme_minimal()
plot_2 <- penguins %>%
ggplot(aes(x = flipper_length_mm, fill = species)) +
geom_histogram(alpha = 0.5, position = "identity") +
# scale_fill_wehi() +
labs(x = "Flipper length (mm)",
y = "Frequency",
fill = "Penguin species") +
theme_minimal()
plot_1 + plot_2 + # plot them side-by-side
plot_layout(guides = "collect") + # this "collects" the legends to the right of the plot
plot_annotation(tag_levels = "a", tag_suffix = ")") # this adds panel labels
plot_1 / plot_2 + # plot them one over the other
plot_layout(guides = "collect") + # this "collects" the legends to the right of the plot
plot_annotation(tag_levels = "a", tag_suffix = ")") # this adds panel labels
# install.packages(c("wordcloud","RColorBrewer))
library(wordcloud) #wordcloud for generating word cloud images
library(RColorBrewer) #RColorBrewer for color palettes
words <- c("RLadies", "Rmarkdown", "tips", "tricks", "script", "rmd", "console", "packages", "share", "4thanniversary", "celebrate",
"RSoftware", "Australia", "Melbourne", "Girls", "Learn","Teach", "Data Structure", "Algorithm", "Visualisation",
"Code", "Data", "ggplot2", "Zoom", "Help", "Text", "RStudio", "programing", "questions", "answers", "Plot", "happy")
freqs <- c(980, 900, 498, 811, 800, 654, 489, 90, 254, 500, 600, 200, 488, 400, 140, 250, 357, 789, 147, 120, 590, 741, 100, 788, 812, 693, 410, 753, 95, 80, 594, 644)
set.seed(3)
wordcloud(words = words,freqs = freqs, scale=c(4,.4), max.words = 1000, random.order=TRUE, random.color=TRUE, colors = brewer.pal(8, "Accent"))
In a code chunk or in an R script, insert a timestamp with ts + shift + tab.
First write ts:
ts
## function (data = NA, start = 1, end = numeric(), frequency = 1,
## deltat = 1, ts.eps = getOption("ts.eps"), class = if (nseries >
## 1) c("mts", "ts", "matrix") else "ts", names = if (!is.null(dimnames(data))) colnames(data) else paste("Series",
## seq(nseries)))
## {
## if (is.data.frame(data))
## data <- data.matrix(data)
## if (is.matrix(data)) {
## nseries <- ncol(data)
## ndata <- nrow(data)
## dimnames(data) <- list(NULL, names)
## }
## else {
## nseries <- 1
## ndata <- length(data)
## }
## if (ndata == 0)
## stop("'ts' object must have one or more observations")
## if (missing(frequency))
## frequency <- 1/deltat
## else if (missing(deltat))
## deltat <- 1/frequency
## if (frequency > 1 && 0 < (d <- abs(frequency - round(frequency))) &&
## d < ts.eps)
## frequency <- round(frequency)
## if (!missing(start))
## start <- as.numeric(start)
## if (length(start) > 1L) {
## start <- start[1L] + (start[2L] - 1)/frequency
## }
## if (!missing(end))
## end <- as.numeric(end)
## if (length(end) > 1L) {
## end <- end[1L] + (end[2L] - 1)/frequency
## }
## if (missing(end))
## end <- start + (ndata - 1)/frequency
## else if (missing(start))
## start <- end - (ndata - 1)/frequency
## if (start > end)
## stop("'start' cannot be after 'end'")
## cycles <- as.numeric((end - start) * frequency)
## if (abs(round(cycles) - cycles) > ts.eps * max(cycles, 1))
## stop("'end' must be a whole number of cycles after 'start'")
## nobs <- floor(cycles + 1.01)
## if (nobs != ndata)
## data <- if (NCOL(data) == 1) {
## if (ndata < nobs)
## rep_len(data, nobs)
## else if (ndata > nobs)
## data[1L:nobs]
## }
## else {
## if (ndata < nobs)
## data[rep_len(1L:ndata, nobs), ]
## else if (ndata > nobs)
## data[1L:nobs, ]
## }
## attr(data, "tsp") <- c(start, end, frequency)
## if (!is.null(class) && class[[1]] != "none")
## attr(data, "class") <- class
## data
## }
## <bytecode: 0x7fa31cb92830>
## <environment: namespace:stats>
Then press shift + tab:
# Wed Nov 11 10:42:54 2020 ------------------------------
A work by R-Ladies Melbourne